!pip install datasets

from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import re
from sklearn.model_selection import train_test_split, KFold
import pandas as pd
from transformers import pipeline

# Mount Google Drive so the project folder is reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
import os
# NOTE(review): `project_path` is only present as a commented-out placeholder;
# it must be assigned before this cell runs or os.chdir raises NameError.
#project_path = <a path>
os.chdir(project_path)

# Two corpora that later receive model predictions and are written back to
# the same file names.
data1 = pd.read_csv('data1.csv')
data2 = pd.read_csv('data2.csv')

# Annotated sample; its `annotated_sentiment` / `annotated_targeted_sentiment`
# columns serve as training and evaluation targets below.
annotated_sample = pd.read_csv('annotated_data.csv')

# 80/20 train/validation split with a fixed seed.
train, val = train_test_split(annotated_sample, test_size=0.2, random_state=42)

# Hypothesis sentence for each overall-sentiment class.
sentiment_labels = dict(
    Positive='This text is positive',
    Neutral='This text is neutral',
    Negative='This text is negative',
)

# Hypothesis builders for sentiment towards a target: each value is a
# callable that takes the target name `x` and returns the hypothesis sentence.
targeted_labels = dict(
    Positive=lambda x: f'The author likes {x}',
    Neutral=lambda x: f'The author is neutral to {x}',
    Negative=lambda x: f'The author dislikes {x}',
)

# Recast the 3-class annotations as entailment pairs: every training row is
# repeated three times, once per candidate class.
train_expanded = train.loc[train.index.repeat(3)].reset_index(drop=True)
train_expanded['sentiment_label'] = ['Positive', 'Neutral', 'Negative'] * len(train)
train_expanded['targeted_sentiment_label'] = ['Positive', 'Neutral', 'Negative'] * len(train)
# Binary target: 0.0 when the candidate class equals the annotation, 1.0
# otherwise (the model below is loaded with id 0 = 'entailment').
train_expanded['annotated_sentiment'] = (train_expanded['annotated_sentiment'] != train_expanded['sentiment_label']).astype(float)
train_expanded['annotated_targeted_sentiment'] = (train_expanded['annotated_targeted_sentiment'] != train_expanded['targeted_sentiment_label']).astype(float)
# Replace the class names by the hypothesis sentences fed to the model.
train_expanded['sentiment_label'] = train_expanded['sentiment_label'].map(sentiment_labels)
train_expanded['targeted_sentiment_label'] = train_expanded.apply(lambda row: targeted_labels[row['targeted_sentiment_label']](row['Receiver']), axis=1)

# One frame per task with the column names used downstream
# (text / labels / hypothesis). rename() is used in its returning form:
# inplace=True on a column selection triggers pandas' SettingWithCopyWarning.
train_expanded_sentiment = train_expanded[["text_en","annotated_sentiment","sentiment_label"]].rename(
    columns={'text_en': 'text', 'annotated_sentiment': 'labels','sentiment_label':'hypothesis'}
)
train_expanded_targeted_sentiment = train_expanded[["text_en","annotated_targeted_sentiment","targeted_sentiment_label"]].rename(
    columns={'text_en': 'text', 'annotated_targeted_sentiment': 'labels','targeted_sentiment_label':'hypothesis'}
)

# Use the GPU when one is available, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): `training_directory` is used below but only appears here as a
# commented-out placeholder; it must be assigned before training.
#training_directory = <model directory>

# Convert the pandas training frames into Hugging Face `Dataset` objects.
train_expanded_sentiment = Dataset.from_pandas(train_expanded_sentiment)
train_expanded_targeted_sentiment = Dataset.from_pandas(train_expanded_targeted_sentiment)

# Pretrained checkpoint that is fine-tuned below and also used as-is as a baseline.
model_name = "mlburnham/Political_DEBATE_base_v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)
def tokenize_function(docs):
    """Encode a (text, hypothesis) pair with the module-level ``tokenizer``.

    ``docs`` must provide ``'text'`` and ``'hypothesis'`` entries; the pair is
    padded with the ``'max_length'`` strategy and truncated.
    """
    premise = docs['text']
    hypothesis = docs['hypothesis']
    encoded = tokenizer(premise, hypothesis, padding = 'max_length', truncation = True)
    return encoded
# Two-way classification head: id 0 = 'entailment', id 1 = 'not_entailment',
# matching the 0.0/1.0 targets built for the training frames.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels = 2, ignore_mismatched_sizes=True, id2label = {0:'entailment', 1:'not_entailment'})

# Shared hyper-parameters for every fine-tuning run in this file.
# No evaluation and no intermediate checkpoints are produced during training
# (eval_strategy / save_strategy are "no"); models are saved explicitly afterwards.
training_args = TrainingArguments(output_dir=training_directory,
    logging_dir=f'{training_directory}/logs',
    lr_scheduler_type= "linear",
    group_by_length=False,
    report_to='none',
    learning_rate = 2e-5,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 16,
    gradient_accumulation_steps = 1,
    num_train_epochs=5,
    warmup_ratio=0.06,
    weight_decay=0.01,
    fp16=False,
    fp16_full_eval=False,
    eval_strategy="no",
    seed=42,
    save_strategy="no",
    dataloader_num_workers = 4,
)

# Tokenize both training sets (one example at a time).
train_expanded_sentiment_tok = train_expanded_sentiment.map(tokenize_function)
train_expanded_targeted_sentiment_tok = train_expanded_targeted_sentiment.map(tokenize_function)

# Each task is fine-tuned on its own copy of the pretrained weights and saved
# right after its own training run. Wrapping a single model object in both
# Trainers and saving only after both runs would write the same weights
# (trained on both tasks in sequence) to the two output directories.

# --- overall sentiment ---
trainer1 = Trainer(
    model = model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_expanded_sentiment_tok
    )
trainer1.train()
trainer1.save_model(f'{training_directory}/sentiment_classifier_base')

# --- targeted sentiment: start again from the pretrained checkpoint ---
targeted_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels = 2, ignore_mismatched_sizes=True, id2label = {0:'entailment', 1:'not_entailment'})

trainer2 = Trainer(
    model = targeted_model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_expanded_targeted_sentiment_tok
    )
trainer2.train()
trainer2.save_model(f'{training_directory}/targeted_sentiment_classifier_base')

# Zero-shot pipelines used for inference:
#   gen_base       - the pretrained checkpoint without fine-tuning (baseline)
#   sent_base      - checkpoint saved as 'sentiment_classifier_base'
#   targ_sent_base - checkpoint saved as 'targeted_sentiment_classifier_base'
gen_base = pipeline("zero-shot-classification", model="mlburnham/Political_DEBATE_base_v1.0", device = device, batch_size = 32)
sent_base = pipeline("zero-shot-classification", model=f'{training_directory}/sentiment_classifier_base', device = device, batch_size = 32)
targ_sent_base = pipeline("zero-shot-classification", model=f'{training_directory}/targeted_sentiment_classifier_base', device = device, batch_size = 32)

def hypothesis_to_label(hypothesis):
  """Translate a hypothesis sentence back into its sentiment class name.

  The classes are tried in the order Positive, Neutral, Negative and the
  first one whose wording occurs anywhere in ``hypothesis`` is returned.
  Returns None when no known wording is found.
  """
  wording_by_label = (
    ("Positive", r"This text is positive|The author likes"),
    ("Neutral", r"This text is neutral|The author is neutral to"),
    ("Negative", r"This text is negative|The author dislikes"),
  )
  for label, wording in wording_by_label:
    if re.search(wording, hypothesis) is not None:
      return label
  return None

def predict_sentiment(text,pipeline,hypothesis_template="{}"):
  """Predict the overall sentiment class of ``text``.

  Args:
    text: The text to classify.
    pipeline: Callable zero-shot classifier. It is called as
      ``pipeline(text, candidate_labels, hypothesis_template=...)`` and must
      return a mapping whose ``"labels"`` list starts with the chosen
      hypothesis.
    hypothesis_template: Template the pipeline uses to turn each candidate
      label into a hypothesis. The default ``"{}"`` passes the labels
      through unchanged.

  Returns:
    "Positive", "Neutral" or "Negative" as mapped by ``hypothesis_to_label``.
  """
  candidate_labels = ['This text is positive',
                      'This text is neutral',
                      'This text is negative']

  # The candidates are already complete hypothesis sentences, identical to the
  # ones used for fine-tuning. Without an explicit template the Hugging Face
  # zero-shot pipeline wraps each label in "This example is {}.", so the model
  # would see a different hypothesis than it was trained on.
  res = pipeline(text, candidate_labels, hypothesis_template=hypothesis_template)
  hypothesis = res["labels"][0]
  return hypothesis_to_label(hypothesis)

def predict_targeted_sentiment(text,username,pipeline,hypothesis_template="{}"):
  """Predict the sentiment class of ``text`` towards ``username``.

  Args:
    text: The text to classify.
    username: Name of the target inserted into each hypothesis.
    pipeline: Callable zero-shot classifier. It is called as
      ``pipeline(text, candidate_labels, hypothesis_template=...)`` and must
      return a mapping whose ``"labels"`` list starts with the chosen
      hypothesis.
    hypothesis_template: Template the pipeline uses to turn each candidate
      label into a hypothesis. The default ``"{}"`` passes the labels
      through unchanged.

  Returns:
    "Positive", "Neutral" or "Negative" as mapped by ``hypothesis_to_label``.
  """
  candidate_labels = [f"The author likes {username}",
                      f"The author is neutral to {username}",
                      f"The author dislikes {username}"]

  # The candidates are already complete hypothesis sentences. Without an
  # explicit template the Hugging Face zero-shot pipeline wraps each label in
  # "This example is {}.", which no longer matches the fine-tuning hypotheses.
  res = pipeline(text, candidate_labels, hypothesis_template=hypothesis_template)
  hypothesis = res["labels"][0]
  return hypothesis_to_label(hypothesis)

# Off-the-shelf multilingual sentiment model used as a baseline. It runs on
# the `device` selected earlier instead of a hard-coded GPU index, so this
# cell also works on a CPU-only runtime.
classifier = pipeline("sentiment-analysis", model="cardiffnlp/twitter-xlm-roberta-base-sentiment", tokenizer="cardiffnlp/twitter-xlm-roberta-base",device=device)

# The baseline reads the `text` column, whereas the NLI models below read `text_en`.
preds = classifier(list(val["text"]),return_all_scores=False)

# Overall sentiment: 1 = baseline, 2 = pretrained NLI model, 3 = fine-tuned NLI model.
val["sentiment_pred1"] = [entry['label'].capitalize() for entry in preds]
val["sentiment_pred2"] = val.apply(lambda row: predict_sentiment(row["text_en"], gen_base), axis=1)
val["sentiment_pred3"] = val.apply(lambda row: predict_sentiment(row["text_en"], sent_base), axis=1)

# Targeted sentiment: the baseline has no notion of a target, so its overall
# prediction is reused as the reference point.
val["targeted_sentiment_pred1"] = [entry['label'].capitalize() for entry in preds]
val["targeted_sentiment_pred2"] = val.apply(lambda row: predict_targeted_sentiment(row["text_en"], row["Receiver"], gen_base), axis=1)
val["targeted_sentiment_pred3"] = val.apply(lambda row: predict_targeted_sentiment(row["text_en"], row["Receiver"], targ_sent_base), axis=1)

rank_mapping = {'Negative': -1, 'Neutral': 0, 'Positive': 1}

# Metric name -> callable(y_true, y_pred).
metrics = {
    'Accuracy': accuracy_score,
    'F1 Score': lambda y_true, y_pred: f1_score(y_true, y_pred, average='macro')
}

# Gold column -> prediction columns to score against it.
tasks = {
    'annotated_sentiment': ['sentiment_pred1','sentiment_pred2','sentiment_pred3'],
    'annotated_targeted_sentiment': ['targeted_sentiment_pred1','targeted_sentiment_pred2','targeted_sentiment_pred3']
}

# results[task][prediction column][metric name] -> score.
# Built as a comprehension so no loop variable is left behind at module
# level; a plain `for task, preds in ...` loop would overwrite the `preds`
# list holding the baseline classifier outputs.
results = {
    task: {
        pred_col: {
            metric_name: metric_func(val[task], val[pred_col])
            for metric_name, metric_func in metrics.items()
        }
        for pred_col in pred_cols
    }
    for task, pred_cols in tasks.items()
}


# Rows: (task, metric); columns: one per model, in the order of the
# prediction columns listed in `tasks`.
summary_df = pd.DataFrame({
    (task, metric): [results[task][pred][metric] for pred in tasks[task]]
    for task in tasks
    for metric in metrics.keys()
}).T

summary_df.index.names = ['Task', 'Metric']
summary_df.columns = ["xlm-roberta-sentiment",'debate base','debate base (fine-tuned)']

# get model performance
summary_df

# NOTE(review): further down the file `import tqdm` rebinds this name to the
# module itself; code that calls `tqdm(...)` directly (process_in_batches)
# has to run before that import.
from tqdm.notebook import tqdm
# presumably registers tqdm's pandas integration; nothing in the visible code relies on it
tqdm.pandas()

def safe_predict_sentiment(row, model):
    """Best-effort wrapper around ``predict_sentiment`` for one row.

    Classifies ``row['text_en']`` with ``model``. Any exception is printed
    and turned into a ``None`` result so a single bad row does not abort a
    long run.
    """
    try:
        label = predict_sentiment(row['text_en'], model)
    except Exception as e:
        print(f"Error processing row: {e}")
        label = None
    return label

def safe_predict_targeted_sentiment(row, model,target):
    """Best-effort wrapper around ``predict_targeted_sentiment`` for one row.

    Classifies ``row['text_en']`` with respect to the name stored in
    ``row[target]``. Any exception is printed and turned into a ``None``
    result so a single bad row does not abort a long run.
    """
    try:
        label = predict_targeted_sentiment(row['text_en'], row[target], model)
    except Exception as e:
        print(f"Error processing row: {e}")
        label = None
    return label

def process_in_batches(dataframe, function, batch_size=100):
    """Apply ``function`` to every row of ``dataframe`` in consecutive batches.

    Args:
        dataframe: Frame whose rows are processed.
        function: Callable taking one row (as passed by
            ``DataFrame.apply(..., axis=1)``) and returning its result.
        batch_size: Number of rows per batch; the progress bar advances once
            per batch.

    Returns:
        The per-batch results concatenated in row order, indexed like
        ``dataframe``. An empty ``dataframe`` yields an empty object Series.
    """
    # len() rather than .empty: a frame with rows but no columns still has
    # rows to iterate over.
    if len(dataframe) == 0:
        # pd.concat raises ValueError when given no objects.
        return pd.Series(index=dataframe.index, dtype=object)

    results = [
        dataframe.iloc[start:start + batch_size].apply(function, axis=1)
        for start in tqdm(range(0, len(dataframe), batch_size))
    ]

    return pd.concat(results)

# Label the first corpus with the fine-tuned models; the target name is read
# from the `Receiver` column. Rows that raise during prediction get None.
data1['predicted_targeted_sentiment'] = process_in_batches(data1, lambda row: safe_predict_targeted_sentiment(row, targ_sent_base,"Receiver"))
data1['predicted_sentiment'] = process_in_batches(data1, lambda row: safe_predict_sentiment(row, sent_base))

# Overwrites the input file with the added prediction columns.
data1.to_csv('data1.csv', index=False)

# Same for the second corpus, whose target name lives in `Receiver_full_name`.
data2['predicted_targeted_sentiment'] = process_in_batches(data2, lambda row: safe_predict_targeted_sentiment(row, targ_sent_base,"Receiver_full_name"))
data2['predicted_sentiment'] = process_in_batches(data2, lambda row: safe_predict_sentiment(row, sent_base))

# Overwrites the input file with the added prediction columns.
data2.to_csv('data2.csv', index=False)

"""# Generate Out-Of-Sample predictions for annotated data"""

#training_directory = <another directory>

# Out-of-sample predictions for the annotated data: the sample is split into
# 8 folds and every fold is labelled by classifiers fine-tuned on the other 7.
kf = KFold(n_splits=8, shuffle=True, random_state=42)
# Positional index 0..n-1 so the fold indices from kf.split can be used with .loc.
annotated_sample = annotated_sample.reset_index(drop=True)
# The prediction columns receive string labels, so start them as object
# columns; initialising with np.nan creates float columns and pandas warns
# when strings are later written into them.
annotated_sample["predicted_sentiment"] = None
annotated_sample["predicted_targeted_sentiment"] = None

for i_temp, (train_index, test_index) in enumerate(kf.split(annotated_sample), start=1):
    print(i_temp)  # fold counter, 1-based
    train_data = annotated_sample.iloc[train_index]
    test_data = annotated_sample.iloc[test_index]

    # Recast the fold's training rows as entailment pairs: one copy per
    # candidate class, target 0.0 when the candidate equals the annotation.
    train_expanded = train_data.loc[train_data.index.repeat(3)].reset_index(drop=True)
    train_expanded['sentiment_label'] = ['Positive', 'Neutral', 'Negative'] * len(train_data)
    train_expanded['targeted_sentiment_label'] = ['Positive', 'Neutral', 'Negative'] * len(train_data)
    train_expanded['annotated_sentiment'] = (train_expanded['annotated_sentiment'] != train_expanded['sentiment_label']).astype(float)
    train_expanded['annotated_targeted_sentiment'] = (train_expanded['annotated_targeted_sentiment'] != train_expanded['targeted_sentiment_label']).astype(float)

    train_expanded['sentiment_label'] = train_expanded['sentiment_label'].map(sentiment_labels)
    train_expanded['targeted_sentiment_label'] = train_expanded.apply(lambda row: targeted_labels[row['targeted_sentiment_label']](row['Receiver']), axis=1)

    # rename() in its returning form avoids SettingWithCopyWarning on the
    # column selections.
    train_expanded_sentiment = train_expanded[["text_en","annotated_sentiment","sentiment_label"]].rename(
        columns={'text_en': 'text', 'annotated_sentiment': 'labels','sentiment_label':'hypothesis'}
    )
    train_expanded_targeted_sentiment = train_expanded[["text_en","annotated_targeted_sentiment","targeted_sentiment_label"]].rename(
        columns={'text_en': 'text', 'annotated_targeted_sentiment': 'labels','targeted_sentiment_label':'hypothesis'}
    )

    train_expanded_sentiment = Dataset.from_pandas(train_expanded_sentiment)
    train_expanded_targeted_sentiment = Dataset.from_pandas(train_expanded_targeted_sentiment)

    train_expanded_sentiment_tok = train_expanded_sentiment.map(tokenize_function)
    train_expanded_targeted_sentiment_tok = train_expanded_targeted_sentiment.map(tokenize_function)

    # Each task starts from a fresh copy of the pretrained weights and is
    # saved right after its own run. Sharing one model object between both
    # Trainers would write identical weights to both fold directories.
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, ignore_mismatched_sizes=True, id2label={0:'entailment', 1:'not_entailment'})

    trainer1 = Trainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=train_expanded_sentiment_tok
    )
    trainer1.train()
    trainer1.save_model(f'{training_directory}/sentiment_classifier_fold')

    targeted_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, ignore_mismatched_sizes=True, id2label={0:'entailment', 1:'not_entailment'})

    trainer2 = Trainer(
        model=targeted_model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=train_expanded_targeted_sentiment_tok
    )
    trainer2.train()
    trainer2.save_model(f'{training_directory}/targeted_sentiment_classifier_fold')

    # The fold directories are overwritten on every iteration; the pipelines
    # are reloaded from them for this fold's held-out rows.
    sent_base = pipeline("zero-shot-classification", model=f'{training_directory}/sentiment_classifier_fold', device=device, batch_size=32)
    targ_sent_base = pipeline("zero-shot-classification", model=f'{training_directory}/targeted_sentiment_classifier_fold', device=device, batch_size=32)

    annotated_sample.loc[test_index, "predicted_sentiment"] = annotated_sample.loc[test_index].apply(lambda row: predict_sentiment(row["text_en"], sent_base), axis=1)
    annotated_sample.loc[test_index, "predicted_targeted_sentiment"] = annotated_sample.loc[test_index].apply(lambda row: predict_targeted_sentiment(row["text_en"], row["Receiver"], targ_sent_base), axis=1)

# Overwrites the annotated file with the added out-of-sample prediction columns.
annotated_sample.to_csv('annotated_data.csv', index=False)

"""## Emotions"""

!pip install pytorch_lightning

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import pytorch_lightning as pl
from torchmetrics import F1Score
from torchmetrics.functional import accuracy, auroc #F1Score #f1
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

from transformers import AutoTokenizer, DebertaV2Model, get_linear_schedule_with_warmup
from torch.optim import AdamW

import tqdm

import pandas as pd

# Output columns of the emotion tagger, in the order of the classifier's outputs.
LABEL_COLUMNS = ['anger_v2', 'fear_v2', 'disgust_v2', 'sadness_v2', 'joy_v2', 'enthusiasm_v2', 'pride_v2', 'hope_v2']
BASE_MODEL_NAME = "microsoft/mdeberta-v3-base"
# NOTE: rebinds the module-level `tokenizer` used by tokenize_function above;
# from here on it refers to the mDeBERTa tokenizer.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
# Number of texts per forward pass in predict_labels.
batch_size = 8
device = "cuda" if torch.cuda.is_available() else "cpu"

class CrowdCodedTagger(pl.LightningModule):
  """Multi-label tagger: a DeBERTa-v2 encoder followed by one linear layer.

  The linear layer maps the hidden state of the first token to ``n_classes``
  outputs, each squashed independently with a sigmoid.
  """

  def __init__(self, n_classes: int, n_training_steps=None, n_warmup_steps=None):
    """Build the encoder and classification head.

    Args:
      n_classes: Number of independent output labels.
      n_training_steps: Total optimizer steps, passed to the LR scheduler.
      n_warmup_steps: Warm-up steps, passed to the LR scheduler.
    """
    super().__init__()
    # NOTE(review): the attribute names `bert` and `classifier` presumably have
    # to match the keys of the checkpoint loaded later - confirm before renaming.
    self.bert = DebertaV2Model.from_pretrained(BASE_MODEL_NAME, return_dict=True)
    self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes)
    self.n_training_steps = n_training_steps
    self.n_warmup_steps = n_warmup_steps
    # BCELoss operates on probabilities; forward applies the sigmoid itself.
    self.criterion = nn.BCELoss()

  def forward(self, input_ids, attention_mask, labels=None, token_type_ids=None):
    """Return ``(loss, probabilities)`` for a batch.

    ``token_type_ids`` is accepted (so a tokenizer output can be unpacked into
    the call) but is not forwarded to the encoder. ``loss`` is the plain
    integer 0 when ``labels`` is None.
    """
    output = self.bert(input_ids, attention_mask=attention_mask)
    # Hidden state of the first token of every sequence.
    output = self.classifier(output.last_hidden_state[:, 0])
    output = torch.sigmoid(output)
    loss = 0
    if labels is not None:
        loss = self.criterion(output, labels)
    return loss, output

  def training_step(self, batch, batch_idx):
    """Compute and log the training loss; also return predictions and labels."""
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log("train_loss", loss, prog_bar=True, logger=True)
    # predictions/labels are consumed by training_epoch_end below.
    return {"loss": loss, "predictions": outputs, "labels": labels}

  def validation_step(self, batch, batch_idx):
    """Compute and log the validation loss."""
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log("val_loss", loss, prog_bar=True, logger=True)
    return loss

  def test_step(self, batch, batch_idx):
    """Compute and log the test loss."""
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    loss, outputs = self(input_ids, attention_mask, labels)
    self.log("test_loss", loss, prog_bar=True, logger=True)
    return loss

  def training_epoch_end(self, outputs):
    """Log one ROC-AUC value per label over all training batches of the epoch.

    NOTE(review): this hook is believed to be unsupported in PyTorch
    Lightning 2.x (replaced by ``on_train_epoch_end``), and recent torchmetrics
    releases appear to require a ``task`` argument for ``auroc`` - verify
    against the installed versions before training with this class.
    """

    # Flatten the per-step outputs into one row per training example.
    labels = []
    predictions = []
    for output in outputs:
      for out_labels in output["labels"].detach().cpu():
        labels.append(out_labels)
      for out_predictions in output["predictions"].detach().cpu():
        predictions.append(out_predictions)

    labels = torch.stack(labels).int()
    predictions = torch.stack(predictions)

    # Column i of predictions/labels corresponds to LABEL_COLUMNS[i].
    for i, name in enumerate(LABEL_COLUMNS):
      class_roc_auc = auroc(predictions[:, i], labels[:, i])
      self.logger.experiment.add_scalar(f"{name}_roc_auc/Train", class_roc_auc, self.current_epoch)

  def configure_optimizers(self):
    """Return AdamW with a linear warm-up schedule stepped every batch."""

    optimizer = AdamW(self.parameters(), lr=2e-5) #DEFINING LEARNING RATE

    scheduler = get_linear_schedule_with_warmup(
      optimizer,
      num_warmup_steps=self.n_warmup_steps,
      num_training_steps=self.n_training_steps
    )

    return dict(
      optimizer=optimizer,
      lr_scheduler=dict(
        scheduler=scheduler,
        interval='step'
      )
    )

def predict_labels(df):
    """Score every row of ``df`` with the module-level emotion ``model``.

    Relies on the module-level ``tokenizer``, ``model``, ``device``,
    ``batch_size`` and ``LABEL_COLUMNS``.

    Args:
        df: Frame with a ``text`` column holding the inputs.

    Returns:
        A new frame with a fresh 0..n-1 index containing the columns of
        ``df`` followed by one column per entry of ``LABEL_COLUMNS``. An
        empty ``df`` yields an empty frame with the same columns.
    """
    input_text = df['text'].tolist()
    scores = []

    # Inference only: skip autograd bookkeeping so activations are not kept
    # alive for a backward pass that never happens.
    with torch.no_grad():
        for start in tqdm.tqdm(range(0, len(input_text), batch_size)):
            batch = input_text[start:start + batch_size]
            encoded_input = tokenizer(batch, padding=True, truncation=True, max_length=512, return_tensors='pt')
            outputs = model(**encoded_input.to(device))
            # The model returns (loss, scores); keep the scores, one row per text.
            scores.extend(outputs[1].tolist())

    # A single frame built at the end also covers the empty-input case, where
    # concatenating a list of per-batch frames would raise ValueError.
    scores_df = pd.DataFrame(scores, columns=LABEL_COLUMNS, dtype=float)
    output_df = pd.concat([df.reset_index(drop=True), scores_df], axis=1)

    return output_df

# NOTE: rebinds the module-level `model`; from here on it is the emotion tagger.
model = CrowdCodedTagger(n_classes=8)
# map_location lets a checkpoint that was saved from a GPU load on a
# CPU-only runtime as well.
state_dict = torch.load("pol_emo_mDeBERTa/model/pytorch_model.pt", map_location=device)
# strict=False tolerates key mismatches; report them so a checkpoint that
# does not match the architecture is not used silently with untrained layers.
load_result = model.load_state_dict(state_dict, strict = False)
if load_result.missing_keys or load_result.unexpected_keys:
    print(f"Checkpoint key mismatch - missing: {load_result.missing_keys}, unexpected: {load_result.unexpected_keys}")
model.to(device)
model.eval()

# Append the emotion scores to the first corpus.
data1 = predict_labels(data1)

# Overwrites the input file with the added score columns.
data1.to_csv('data1.csv', index=False)